library(tidyverse)
## ── Attaching packages ──────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ─────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(janeaustenr)
# Tokenize all Austen novels into single words, keep only the first
# run of lowercase letters/apostrophes in each token (drops digits and
# the underscores Gutenberg uses for italics), and count per book.
book_words <- austen_books() %>%
  unnest_tokens(word, text) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(book, word, sort = TRUE)
# Per-book token totals, needed to turn raw counts into term frequencies.
total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))

# Attach each book's total to every (book, word) row. The join key is
# spelled out so the code no longer relies on dplyr's implicit
# common-column matching (which also prints a "Joining, by" message).
book_words <- book_words %>%
  left_join(total_words, by = "book")
## Joining, by = "book"
book_words
## # A tibble: 39,708 x 4
## book word n total
## <fct> <chr> <int> <int>
## 1 Mansfield Park the 6209 160460
## 2 Mansfield Park to 5477 160460
## 3 Mansfield Park and 5439 160460
## 4 Emma to 5242 160996
## 5 Emma the 5204 160996
## 6 Emma and 4897 160996
## 7 Mansfield Park of 4778 160460
## 8 Pride & Prejudice the 4331 122204
## 9 Emma of 4293 160996
## 10 Pride & Prejudice to 4163 122204
## # … with 39,698 more rows
book_words %>%
ggplot(aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, .0009)+
facet_wrap(~book, ncol = 2, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 896 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
# Number of (book, word) pairs that occur exactly once.
sum(book_words$n == 1)
## [1] 15929
book_words %>%
group_by(book) %>%
mutate(rank = row_number(),
term_frequency = n/total) ->
freq_by_rank
freq_by_rank
## # A tibble: 39,708 x 6
## # Groups: book [6]
## book word n total rank term_frequency
## <fct> <chr> <int> <int> <int> <dbl>
## 1 Mansfield Park the 6209 160460 1 0.0387
## 2 Mansfield Park to 5477 160460 2 0.0341
## 3 Mansfield Park and 5439 160460 3 0.0339
## 4 Emma to 5242 160996 1 0.0326
## 5 Emma the 5204 160996 2 0.0323
## 6 Emma and 4897 160996 3 0.0304
## 7 Mansfield Park of 4778 160460 4 0.0298
## 8 Pride & Prejudice the 4331 122204 1 0.0354
## 9 Emma of 4293 160996 4 0.0267
## 10 Pride & Prejudice to 4163 122204 2 0.0341
## # … with 39,698 more rows
freq_by_rank %>%
ggplot(aes(rank, term_frequency, color = book)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
freq_by_rank %>%
filter(rank < 500, rank > 10) ->
rank_subset
# Fit Zipf's law on the mid-rank subset: log10(term_frequency) should be
# roughly linear in log10(rank), with slope near -1.
lmout <- lm(log10(term_frequency) ~ log10(rank), data = rank_subset)
# Report term, estimate and p-value. Columns are selected by name rather
# than by position so the output does not break if broom ever reorders
# its columns.
broom::tidy(lmout)[c("term", "estimate", "p.value")]
## # A tibble: 2 x 3
## term estimate p.value
## <chr> <dbl> <dbl>
## 1 (Intercept) -0.621 0
## 2 log10(rank) -1.11 0
freq_by_rank %>%
ggplot(aes(rank, term_frequency, color = book)) +
geom_abline(intercept = -0.62, slope = -1.1, color = "red", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
book_words %>% bind_tf_idf(word, book, n) ->
book_words
book_words
## # A tibble: 39,708 x 7
## book word n total tf idf tf_idf
## <fct> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 Mansfield Park the 6209 160460 0.0387 0 0
## 2 Mansfield Park to 5477 160460 0.0341 0 0
## 3 Mansfield Park and 5439 160460 0.0339 0 0
## 4 Emma to 5242 160996 0.0326 0 0
## 5 Emma the 5204 160996 0.0323 0 0
## 6 Emma and 4897 160996 0.0304 0 0
## 7 Mansfield Park of 4778 160460 0.0298 0 0
## 8 Pride & Prejudice the 4331 122204 0.0354 0 0
## 9 Emma of 4293 160996 0.0267 0 0
## 10 Pride & Prejudice to 4163 122204 0.0341 0 0
## # … with 39,698 more rows
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 39,708 x 6
## book word n tf idf tf_idf
## <fct> <chr> <int> <dbl> <dbl> <dbl>
## 1 Sense & Sensibility elinor 623 0.00519 1.79 0.00931
## 2 Sense & Sensibility marianne 492 0.00410 1.79 0.00735
## 3 Mansfield Park crawford 493 0.00307 1.79 0.00551
## 4 Pride & Prejudice darcy 374 0.00306 1.79 0.00548
## 5 Persuasion elliot 254 0.00304 1.79 0.00544
## 6 Emma emma 786 0.00488 1.10 0.00536
## 7 Northanger Abbey tilney 196 0.00252 1.79 0.00452
## 8 Emma weston 389 0.00242 1.79 0.00433
## 9 Pride & Prejudice bennet 294 0.00241 1.79 0.00431
## 10 Persuasion wentworth 191 0.00228 1.79 0.00409
## # … with 39,698 more rows
book_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(book) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
library(gutenbergr)
physics <- gutenberg_download(c(37729, 14725, 13476, 30155), meta_fields = "author")
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
physics %>%
unnest_tokens(word, text) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(author, word, sort = TRUE) ->
physics_words
physics_words
## # A tibble: 11,849 x 3
## author word n
## <chr> <chr> <int>
## 1 Galilei, Galileo the 3770
## 2 Tesla, Nikola the 3606
## 3 Huygens, Christiaan the 3553
## 4 Einstein, Albert the 2995
## 5 Galilei, Galileo of 2051
## 6 Einstein, Albert of 2028
## 7 Tesla, Nikola of 1737
## 8 Huygens, Christiaan of 1708
## 9 Huygens, Christiaan to 1207
## 10 Tesla, Nikola a 1176
## # … with 11,839 more rows
physics_words %>%
bind_tf_idf(word, author, n) %>%
mutate(word = fct_reorder(word, tf_idf)) %>%
mutate(author = factor(author, levels = c("Galilei, Galileo",
"Huygens, Christiaan",
"Tesla, Nikola",
"Einstein, Albert"))) ->
physics_plot
physics_plot %>%
group_by(author) %>%
top_n(15, tf_idf) %>%
ungroup() %>%
mutate(word = reorder(word, tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~author, ncol = 2, scales = "free") +
coord_flip()
physics %>%
filter(str_detect(text, "RC")) %>%
select(text)
## # A tibble: 44 x 1
## text
## <chr>
## 1 line RC, parallel and equal to AB, to be a portion of a wave of light,
## 2 represents the partial wave coming from the point A, after the wave RC
## 3 be the propagation of the wave RC which fell on AB, and would be the
## 4 transparent body; seeing that the wave RC, having come to the aperture
## 5 incident rays. Let there be such a ray RC falling upon the surface
## 6 CK. Make CO perpendicular to RC, and across the angle KCO adjust OK,
## 7 the required refraction of the ray RC. The demonstration of this is,
## 8 explaining ordinary refraction. For the refraction of the ray RC is
## 9 29. Now as we have found CI the refraction of the ray RC, similarly
## 10 the ray _r_C is inclined equally with RC, the line C_d_ will
## # … with 34 more rows
mystopwords <- tibble(word = c("eq", "co", "rc", "ac", "ak", "bn",
"fig", "file", "cg", "cb", "cm",
"ab"))
physics_words <- anti_join(physics_words, mystopwords,
by = "word")
plot_physics <- physics_words %>%
bind_tf_idf(word, author, n) %>%
mutate(word = str_remove_all(word, "_")) %>%
group_by(author) %>%
top_n(15, tf_idf) %>%
ungroup() %>%
mutate(word = reorder_within(word, tf_idf, author)) %>%
mutate(author = factor(author, levels = c("Galilei, Galileo",
"Huygens, Christiaan",
"Tesla, Nikola",
"Einstein, Albert")))
ggplot(plot_physics, aes(word, tf_idf, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~author, ncol = 2, scales = "free") +
coord_flip() +
scale_x_reordered()
# ngrams
austen_books() %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) ->
austen_bigrams
austen_bigrams
## # A tibble: 725,049 x 2
## book bigram
## <fct> <chr>
## 1 Sense & Sensibility sense and
## 2 Sense & Sensibility and sensibility
## 3 Sense & Sensibility sensibility by
## 4 Sense & Sensibility by jane
## 5 Sense & Sensibility jane austen
## 6 Sense & Sensibility austen 1811
## 7 Sense & Sensibility 1811 chapter
## 8 Sense & Sensibility chapter 1
## 9 Sense & Sensibility 1 the
## 10 Sense & Sensibility the family
## # … with 725,039 more rows
austen_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 211,236 x 2
## bigram n
## <chr> <int>
## 1 of the 3017
## 2 to be 2787
## 3 in the 2368
## 4 it was 1781
## 5 i am 1545
## 6 she had 1472
## 7 of her 1445
## 8 to the 1387
## 9 she was 1377
## 10 had been 1299
## # … with 211,226 more rows
austen_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")->
bigrams_separated
bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) ->
bigrams_filtered
# new bigram counts:
bigrams_filtered %>%
count(word1, word2, sort = TRUE)->
bigram_counts
bigram_counts
## # A tibble: 33,421 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 sir thomas 287
## 2 miss crawford 215
## 3 captain wentworth 170
## 4 miss woodhouse 162
## 5 frank churchill 132
## 6 lady russell 118
## 7 lady bertram 114
## 8 sir walter 113
## 9 miss fairfax 109
## 10 colonel brandon 108
## # … with 33,411 more rows
bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")->
bigrams_united
bigrams_united %>%
count(bigram, sort = TRUE)
## # A tibble: 33,421 x 2
## bigram n
## <chr> <int>
## 1 sir thomas 287
## 2 miss crawford 215
## 3 captain wentworth 170
## 4 miss woodhouse 162
## 5 frank churchill 132
## 6 lady russell 118
## 7 lady bertram 114
## 8 sir walter 113
## 9 miss fairfax 109
## 10 colonel brandon 108
## # … with 33,411 more rows
bigrams_filtered %>%
filter(word2 == "street") %>%
count(book, word1, sort = TRUE)
## # A tibble: 34 x 3
## book word1 n
## <fct> <chr> <int>
## 1 Sense & Sensibility berkeley 16
## 2 Sense & Sensibility harley 16
## 3 Northanger Abbey pulteney 14
## 4 Northanger Abbey milsom 11
## 5 Mansfield Park wimpole 10
## 6 Pride & Prejudice gracechurch 9
## 7 Sense & Sensibility conduit 6
## 8 Sense & Sensibility bond 5
## 9 Persuasion milsom 5
## 10 Persuasion rivers 4
## # … with 24 more rows
#or
bigrams_united %>%
filter(str_detect(bigram,"street")) %>%
count(book, bigram, sort = TRUE)
## # A tibble: 57 x 3
## book bigram n
## <fct> <chr> <int>
## 1 Sense & Sensibility berkeley street 16
## 2 Sense & Sensibility harley street 16
## 3 Northanger Abbey pulteney street 14
## 4 Northanger Abbey milsom street 11
## 5 Mansfield Park wimpole street 10
## 6 Pride & Prejudice gracechurch street 9
## 7 Sense & Sensibility conduit street 6
## 8 Sense & Sensibility bond street 5
## 9 Persuasion milsom street 5
## 10 Persuasion rivers street 4
## # … with 47 more rows
bigrams_united %>%
count(book, bigram) %>%
bind_tf_idf(bigram, book, n) %>%
arrange(desc(tf_idf)) ->
bigram_tf_idf
bigram_tf_idf
## # A tibble: 36,217 x 6
## book bigram n tf idf tf_idf
## <fct> <chr> <int> <dbl> <dbl> <dbl>
## 1 Persuasion captain wentworth 170 0.0299 1.79 0.0535
## 2 Mansfield Park sir thomas 287 0.0287 1.79 0.0515
## 3 Mansfield Park miss crawford 215 0.0215 1.79 0.0386
## 4 Persuasion lady russell 118 0.0207 1.79 0.0371
## 5 Persuasion sir walter 113 0.0198 1.79 0.0356
## 6 Emma miss woodhouse 162 0.0170 1.79 0.0305
## 7 Northanger Abbey miss tilney 82 0.0159 1.79 0.0286
## 8 Sense & Sensibility colonel brandon 108 0.0150 1.79 0.0269
## 9 Emma frank churchill 132 0.0139 1.79 0.0248
## 10 Pride & Prejudice lady catherine 100 0.0138 1.79 0.0247
## # … with 36,207 more rows
bigram_tf_idf %>%
group_by(book) %>%
top_n(10, tf_idf) %>%
ungroup() %>%
mutate(bigram = reorder_within(bigram, tf_idf, book)) %>%
ggplot(aes(bigram, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~book, ncol = 2, scales = "free") +
coord_flip() +
scale_x_reordered()
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
## # A tibble: 1,246 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not be 610
## 2 not to 355
## 3 not have 327
## 4 not know 252
## 5 not a 189
## 6 not think 176
## 7 not been 160
## 8 not the 147
## 9 not at 129
## 10 not in 118
## # … with 1,236 more rows
AFINN <- get_sentiments("afinn")
AFINN
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word2, value, sort = TRUE) ->
not_words
not_words
## # A tibble: 245 x 3
## word2 value n
## <chr> <dbl> <int>
## 1 like 2 99
## 2 help 2 82
## 3 want 1 45
## 4 wish 1 39
## 5 allow 1 36
## 6 care 2 23
## 7 sorry -1 21
## 8 leave -1 18
## 9 pretend -1 18
## 10 worth 2 17
## # … with 235 more rows
# Words most often preceded by "not", weighted by AFINN sentiment value.
# A large bar for a positive word means its sentiment was frequently
# negated and would be mis-counted by a unigram analysis.
# Fix: the original recomputed `n * value` three times inside aes()
# instead of using the `contribution` column it had just created.
not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment value * number of occurrences") +
  coord_flip()
negation_words <- c("not", "no", "never", "without")
bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, value, sort = TRUE) ->
negated_words
negated_words %>%
mutate(contribution = n * value) %>%
group_by(word1) %>%
top_n(12, abs(contribution)) %>%
ungroup() %>%
ggplot(aes(reorder_within(word2,contribution, word1), n * value,
fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words preceded by negation term") +
ylab("Sentiment value * Number of Occurrences") +
coord_flip() +
facet_wrap(~word1, scales = "free") +
scale_x_discrete(labels = function(x) str_replace(x,"__.+$", ""))
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
bigram_counts
## # A tibble: 33,421 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 sir thomas 287
## 2 miss crawford 215
## 3 captain wentworth 170
## 4 miss woodhouse 162
## 5 frank churchill 132
## 6 lady russell 118
## 7 lady bertram 114
## 8 sir walter 113
## 9 miss fairfax 109
## 10 colonel brandon 108
## # … with 33,411 more rows
bigram_counts %>%
filter(n > 20) %>%
graph_from_data_frame()->
bigram_graph
bigram_graph
## IGRAPH bd7ff33 DN-- 91 77 --
## + attr: name (v/c), n (e/n)
## + edges from bd7ff33 (vertex names):
## [1] sir ->thomas miss ->crawford captain ->wentworth
## [4] miss ->woodhouse frank ->churchill lady ->russell
## [7] lady ->bertram sir ->walter miss ->fairfax
## [10] colonel ->brandon miss ->bates lady ->catherine
## [13] sir ->john jane ->fairfax miss ->tilney
## [16] lady ->middleton miss ->bingley thousand->pounds
## [19] miss ->dashwood miss ->bennet john ->knightley
## [22] miss ->morland captain ->benwick dear ->miss
## + ... omitted several edges
library(ggraph)
set.seed(17)
ggraph(bigram_graph, layout = "fr") + #The layout_with_fr() Fruchterman-Reingold layout
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)+
theme_void()
# With repel = TRUE
ggraph(bigram_graph, layout = "fr") + # layout_with_fr() uses the
# Fruchterman-Reingold layout
geom_edge_link() +
#geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1, repel = TRUE) +
theme_void()
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
data("AssociatedPress", package = "topicmodels")
AssociatedPress
## <<DocumentTermMatrix (documents: 2246, terms: 10473)>>
## Non-/sparse entries: 302031/23220327
## Sparsity : 99%
## Maximal term length: 18
## Weighting : term frequency (tf)
ap_td <- tidy(AssociatedPress)
ap_td
## # A tibble: 302,031 x 3
## document term count
## <int> <chr> <dbl>
## 1 1 adding 1
## 2 1 adult 2
## 3 1 ago 1
## 4 1 alcohol 1
## 5 1 allegedly 1
## 6 1 allen 1
## 7 1 apparently 2
## 8 1 appeared 1
## 9 1 arrested 1
## 10 1 assault 1
## # … with 302,021 more rows
ap_td %>%
inner_join(get_sentiments("bing"), by = c(term = "word"))->
ap_sentiments
ap_sentiments
## # A tibble: 30,094 x 4
## document term count sentiment
## <int> <chr> <dbl> <chr>
## 1 1 assault 1 negative
## 2 1 complex 1 negative
## 3 1 death 1 negative
## 4 1 died 1 negative
## 5 1 good 2 positive
## 6 1 illness 1 negative
## 7 1 killed 2 negative
## 8 1 like 2 positive
## 9 1 liked 1 positive
## 10 1 miracle 1 positive
## # … with 30,084 more rows
# Terms contributing most to AP corpus sentiment; negative-sentiment
# counts are flipped below zero so the bars diverge around 0.
# Fixes: count() already returns an ungrouped tibble, so the extra
# ungroup() was a no-op; geom_bar(stat = "identity") is replaced by its
# modern equivalent geom_col(), matching the rest of this file.
ap_sentiments %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 200) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_col() +
  ylab("Contribution to sentiment") +
  coord_flip()
data("data_corpus_inaugural", package = "quanteda")
inaug_dfm <- quanteda::dfm(data_corpus_inaugural, verbose = FALSE)
inaug_dfm
## Document-feature matrix of: 58 documents, 9,360 features (91.8% sparse) and 4 docvars.
## features
## docs fellow-citizens of the senate and house representatives :
## 1789-Washington 1 71 116 1 48 2 2 1
## 1793-Washington 0 11 13 0 2 0 0 1
## 1797-Adams 3 140 163 1 130 0 2 0
## 1801-Jefferson 2 104 130 0 81 0 0 1
## 1805-Jefferson 0 101 143 0 93 0 0 0
## 1809-Madison 1 69 104 0 43 0 0 0
## features
## docs among vicissitudes
## 1789-Washington 1 1
## 1793-Washington 0 0
## 1797-Adams 4 0
## 1801-Jefferson 1 0
## 1805-Jefferson 7 0
## 1809-Madison 0 0
## [ reached max_ndoc ... 52 more documents, reached max_nfeat ... 9,350 more features ]
inaug_td <- tidy(inaug_dfm)
inaug_td
## # A tibble: 44,710 x 3
## document term count
## <chr> <chr> <dbl>
## 1 1789-Washington fellow-citizens 1
## 2 1797-Adams fellow-citizens 3
## 3 1801-Jefferson fellow-citizens 2
## 4 1809-Madison fellow-citizens 1
## 5 1813-Madison fellow-citizens 1
## 6 1817-Monroe fellow-citizens 5
## 7 1821-Monroe fellow-citizens 1
## 8 1841-Harrison fellow-citizens 11
## 9 1845-Polk fellow-citizens 1
## 10 1849-Taylor fellow-citizens 1
## # … with 44,700 more rows
inaug_tf_idf <- inaug_td %>%
bind_tf_idf(term, document, count) %>%
arrange(desc(tf_idf))
inaug_tf_idf
## # A tibble: 44,710 x 6
## document term count tf idf tf_idf
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1793-Washington arrive 1 0.00680 4.06 0.0276
## 2 1793-Washington upbraidings 1 0.00680 4.06 0.0276
## 3 1793-Washington violated 1 0.00680 3.37 0.0229
## 4 1793-Washington willingly 1 0.00680 3.37 0.0229
## 5 1793-Washington incurring 1 0.00680 3.37 0.0229
## 6 1793-Washington previous 1 0.00680 2.96 0.0201
## 7 1793-Washington knowingly 1 0.00680 2.96 0.0201
## 8 1793-Washington injunctions 1 0.00680 2.96 0.0201
## 9 1793-Washington witnesses 1 0.00680 2.96 0.0201
## 10 1793-Washington besides 1 0.00680 2.67 0.0182
## # … with 44,700 more rows
# Top tf-idf terms for four landmark inaugural addresses.
# BUG FIX: the original called arrange(desc(tf-idf)), which subtracts
# the idf column from tf and sorts by that difference; the intended
# column is tf_idf. (top_n(10) was unaffected, since it selects by the
# last column, tf_idf, as the "Selecting by tf_idf" message shows.)
inaug_tf_idf %>%
  filter(document %in% c("1861-Lincoln", "1933-Roosevelt","1961-Kennedy","2009-Obama")) %>%
  mutate(term = str_extract(term, "[a-z']+")) %>%
  group_by(document) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10) %>%
  ungroup() %>%
  ggplot(aes(x = reorder_within(term, tf_idf, document),
             y = tf_idf, fill = document)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~document, scales = "free") +
  coord_flip() +
  # reorder_within() appended a "___group" suffix; strip it from labels.
  scale_x_discrete(labels = function(x) str_replace(x, "__.+$", ""))
## Selecting by tf_idf
ap_td %>%
cast_dtm(document, term, count)
## <<DocumentTermMatrix (documents: 2246, terms: 10473)>>
## Non-/sparse entries: 302031/23220327
## Sparsity : 99%
## Maximal term length: 18
## Weighting : term frequency (tf)
ap_td %>%
cast_dfm(document, term, count)
## Document-feature matrix of: 2,246 documents, 10,473 features (98.7% sparse).
## features
## docs adding adult ago alcohol allegedly allen apparently appeared arrested
## 1 1 2 1 1 1 1 2 1 1
## 2 0 0 0 0 0 0 0 1 0
## 3 0 0 1 0 0 0 0 1 0
## 4 0 0 3 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 2 0 0 0 0 0 0
## features
## docs assault
## 1 1
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## [ reached max_ndoc ... 2,240 more documents, reached max_nfeat ... 10,463 more features ]
data("acq")
acq
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 50
acq_td <- tidy(acq)
acq_td
## # A tibble: 50 x 16
## author datetimestamp description heading id language origin topics
## <chr> <dttm> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 <NA> 1987-02-26 10:18:06 "" COMPUT… 10 en Reute… YES
## 2 <NA> 1987-02-26 10:19:15 "" OHIO M… 12 en Reute… YES
## 3 <NA> 1987-02-26 10:49:56 "" MCLEAN… 44 en Reute… YES
## 4 By Ca… 1987-02-26 10:51:17 "" CHEMLA… 45 en Reute… YES
## 5 <NA> 1987-02-26 11:08:33 "" <COFAB… 68 en Reute… YES
## 6 <NA> 1987-02-26 11:32:37 "" INVEST… 96 en Reute… YES
## 7 By Pa… 1987-02-26 11:43:13 "" AMERIC… 110 en Reute… YES
## 8 <NA> 1987-02-26 11:59:25 "" HONG K… 125 en Reute… YES
## 9 <NA> 1987-02-26 12:01:28 "" LIEBER… 128 en Reute… YES
## 10 <NA> 1987-02-26 12:08:27 "" GULF A… 134 en Reute… YES
## # … with 40 more rows, and 8 more variables: lewissplit <chr>, cgisplit <chr>,
## # oldid <chr>, places <named list>, people <lgl>, orgs <lgl>,
## # exchanges <lgl>, text <chr>
acq_tokens <- acq_td %>%
select(-places) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words, by = "word")
# most common words
acq_tokens %>%
count(word, sort = TRUE)
## # A tibble: 1,566 x 2
## word n
## <chr> <int>
## 1 dlrs 100
## 2 pct 70
## 3 mln 65
## 4 company 63
## 5 shares 52
## 6 reuter 50
## 7 stock 46
## 8 offer 34
## 9 share 34
## 10 american 28
## # … with 1,556 more rows
# tf-idf
acq_tokens %>%
count(id, word) %>%
bind_tf_idf(word, id, n) %>%
arrange(desc(tf_idf))
## # A tibble: 2,853 x 6
## id word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 186 groupe 2 0.133 3.91 0.522
## 2 128 liebert 3 0.130 3.91 0.510
## 3 474 esselte 5 0.109 3.91 0.425
## 4 371 burdett 6 0.103 3.91 0.405
## 5 442 hazleton 4 0.103 3.91 0.401
## 6 199 circuit 5 0.102 3.91 0.399
## 7 162 suffield 2 0.1 3.91 0.391
## 8 498 west 3 0.1 3.91 0.391
## 9 441 rmj 8 0.121 3.22 0.390
## 10 467 nursery 3 0.0968 3.91 0.379
## # … with 2,843 more rows
unloadNamespace("tidytext")
unloadNamespace("ggraph")
unloadNamespace("tidygraph")
unloadNamespace("igraph")
unloadNamespace("tm")
unloadNamespace("gutenbergr")
unloadNamespace("NLP")
unloadNamespace("janeaustenr")
search()
## [1] ".GlobalEnv" "package:forcats" "package:stringr"
## [4] "package:dplyr" "package:purrr" "package:readr"
## [7] "package:tidyr" "package:tibble" "package:ggplot2"
## [10] "package:tidyverse" "package:stats" "package:graphics"
## [13] "package:grDevices" "package:utils" "package:datasets"
## [16] "package:methods" "Autoloads" "package:base"
library(tidyverse)
library(rnaturalearth)
library(rnaturalearthdata)
library(rgeos)
## Loading required package: sp
## rgeos version: 0.5-2, (SVN revision 621)
## GEOS runtime version: 3.7.2-CAPI-1.11.2
## Linking to sp version: 1.3-1
## Polygon checking: TRUE
library(ggspatial)
library(sf)
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
library(lwgeom)
## Linking to liblwgeom 3.0.0beta1 r16016, GEOS 3.7.2, PROJ 5.2.0
theme_set(theme_bw())
world <- ne_countries(scale = "medium", returnclass = "sf")
class(world)
## [1] "sf" "data.frame"
ggplot(data = world) +
geom_sf() +
xlab("Longitude") + ylab("Latitude") +
ggtitle("World map",
subtitle = paste0("(", length(unique(world$name)), " countries)"))
ggplot(data = world) +
geom_sf(aes(fill = pop_est)) +
scale_fill_viridis_c(option = "plasma", trans = "sqrt")
ggplot(data = world) +
geom_sf() +
coord_sf(xlim = c(-102.15, -74.12), ylim = c(7.65, 33.97), expand = FALSE)
ggplot(data = world) +
geom_sf(aes(fill = gdp_md_est)) +
coord_sf(xlim = c(70, 150), ylim = c(15, 55), expand = FALSE) +
scale_fill_viridis_c(option = "plasma", trans = "sqrt")
world_points<- st_centroid(world)
## Warning in st_centroid.sf(world): st_centroid assumes attributes are constant
## over geometries of x
## Warning in st_centroid.sfc(st_geometry(x), of_largest_polygon =
## of_largest_polygon): st_centroid does not give correct centroids for longitude/
## latitude data
world_points <- cbind(world, st_coordinates(st_centroid(world$geometry)))
## Warning in st_centroid.sfc(world$geometry): st_centroid does not give correct
## centroids for longitude/latitude data
ggplot(data = world) +
geom_sf() +
geom_text(data= world_points,aes(x=X, y=Y, label=name),
color = "darkblue", fontface = "bold", check_overlap = FALSE) +
annotate(geom = "text", x = -90, y = 26, label = "Gulf of Mexico",
fontface = "italic", color = "grey22", size = 6) +
coord_sf(xlim = c(-97.15, -70.12), ylim = c(7.65, 30.97), expand = FALSE)
ggplot(data = world) +
geom_sf(fill= "antiquewhite") +
geom_text(data= world_points,aes(x=X, y=Y, label=name),
color = "darkblue", fontface = "bold", check_overlap = FALSE) +
annotate(geom = "text", x = -90, y = 26, label = "Gulf of Mexico",
fontface = "italic", color = "grey22", size = 6) +
annotation_scale(location = "bl", width_hint = 0.5) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.75, "in"), pad_y = unit(0.5, "in"),
style = north_arrow_fancy_orienteering) +
coord_sf(xlim = c(-102.15, -74.12), ylim = c(7.65, 33.97), expand = FALSE) +
xlab("Longitude") + ylab("Latitude") +
ggtitle("Map of the Gulf of Mexico and the Caribbean Sea") +
theme(panel.grid.major = element_line(color = gray(.5),
linetype = "dashed", size = 0.5),
panel.background = element_rect(fill = "aliceblue"))
## Scale on map varies by more than 10%, scale bar may be inaccurate
sites <- data.frame(longitude = c(-80.144005, -80.109), latitude = c(26.479005,
26.83))
sites <- st_as_sf(sites, coords = c("longitude", "latitude"),
crs = 4326, agr = "constant")
ggplot(data = world) +
geom_sf() +
geom_sf(data = sites, size = 4, shape = 23, fill = "darkred") +
coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE)
ggplot(data = world) +
geom_sf() +
geom_sf(data = sites, size = 4, shape = 23, fill = "darkred") +
coord_sf(xlim = c(-82, -78), ylim = c(24.5, 28), expand = FALSE)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
states <- st_as_sf(map("state", plot = FALSE, fill = TRUE))
head(states)
## Simple feature collection with 6 features and 1 field
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -124.3834 ymin: 30.24071 xmax: -71.78015 ymax: 42.04937
## CRS: EPSG:4326
## ID geom
## 1 alabama MULTIPOLYGON (((-87.46201 3...
## 2 arizona MULTIPOLYGON (((-114.6374 3...
## 3 arkansas MULTIPOLYGON (((-94.05103 3...
## 4 california MULTIPOLYGON (((-120.006 42...
## 5 colorado MULTIPOLYGON (((-102.0552 4...
## 6 connecticut MULTIPOLYGON (((-73.49902 4...
states <- cbind(states, st_coordinates(st_centroid(states)))
## Warning in st_centroid.sf(states): st_centroid assumes attributes are constant
## over geometries of x
## Warning in st_centroid.sfc(st_geometry(x), of_largest_polygon =
## of_largest_polygon): st_centroid does not give correct centroids for longitude/
## latitude data
states$ID <- stringr::str_to_title(states$ID)
ggplot(data = world) +
geom_sf() +
geom_sf(data = states, fill = NA) +
geom_text(data = states, aes(X, Y, label = ID), size = 5) +
coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE)
counties <- st_as_sf(map("county", plot = FALSE, fill = TRUE))
counties <- subset(counties, grepl("florida", counties$ID))
counties$area <- as.numeric(st_area(counties))
head(counties)
## Simple feature collection with 6 features and 2 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -85.98951 ymin: 25.94926 xmax: -80.08804 ymax: 30.57303
## CRS: EPSG:4326
## ID geom area
## 290 florida,alachua MULTIPOLYGON (((-82.66062 2... 2498863359
## 291 florida,baker MULTIPOLYGON (((-82.04182 3... 1542466064
## 292 florida,bay MULTIPOLYGON (((-85.40509 3... 1946587533
## 293 florida,bradford MULTIPOLYGON (((-82.4257 29... 818898090
## 294 florida,brevard MULTIPOLYGON (((-80.94747 2... 2189682999
## 295 florida,broward MULTIPOLYGON (((-80.89018 2... 3167386973
ggplot(data = world) +
geom_sf() +
geom_sf(data = counties, aes(fill = area)) +
scale_fill_viridis_c(trans = "sqrt", alpha = .4) +
coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE)
flcities <- data.frame(state = rep("Florida", 5), city = c("Miami",
"Tampa", "Orlando", "Jacksonville", "Sarasota"), lat = c(25.7616798,
27.950575, 28.5383355, 30.3321838, 27.3364347), lng = c(-80.1917902,
-82.4571776, -81.3792365, -81.655651, -82.5306527))
(flcities <- st_as_sf(flcities, coords = c("lng", "lat"), remove = FALSE,
crs = 4326, agr = "constant"))
## Simple feature collection with 5 features and 4 fields
## Attribute-geometry relationship: 4 constant, 0 aggregate, 0 identity
## geometry type: POINT
## dimension: XY
## bbox: xmin: -82.53065 ymin: 25.76168 xmax: -80.19179 ymax: 30.33218
## CRS: EPSG:4326
## state city lat lng geometry
## 1 Florida Miami 25.76168 -80.19179 POINT (-80.19179 25.76168)
## 2 Florida Tampa 27.95058 -82.45718 POINT (-82.45718 27.95058)
## 3 Florida Orlando 28.53834 -81.37924 POINT (-81.37924 28.53834)
## 4 Florida Jacksonville 30.33218 -81.65565 POINT (-81.65565 30.33218)
## 5 Florida Sarasota 27.33643 -82.53065 POINT (-82.53065 27.33643)
ggplot(data = world) +
geom_sf() +
geom_sf(data = counties, fill = NA, color = gray(.5)) +
geom_sf(data = flcities) +
geom_text(data = flcities, aes(x = lng, y = lat, label = city),
size = 3.9, col = "black", fontface = "bold") +
coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE)
library("ggrepel")
ggplot(data = world) +
geom_sf() +
geom_sf(data = counties, fill = NA, color = gray(.5)) +
geom_sf(data = flcities) +
geom_text_repel(data = flcities, aes(x = lng, y = lat, label = city),
fontface = "bold", nudge_x = c(1, -1.5, 2, 2, -1), nudge_y = c(0.25,
-0.25, 0.5, 0.5, -0.5)) +
coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE)
states$nudge_y <- -1
states$nudge_y[states$ID == "Florida"] <- 1.1
states$nudge_y[states$ID == "South Carolina"] <- .1
ggplot(data = world) +
geom_sf(fill = "antiquewhite1") +
geom_sf(data = counties, aes(fill = area)) +
geom_sf(data = states, fill = NA) +
geom_sf(data = sites, size = 4, shape = 23, fill = "darkred") +
geom_sf(data = flcities) +
geom_text_repel(data = flcities, aes(x = lng, y = lat, label = city),
fontface = "bold", nudge_x = c(1, -1.5, 2, 2, -1), nudge_y = c(0.25,
-0.25, 0.5, 0.5, -0.5)) +
geom_label(data = states, aes(X, Y, label = ID), size = 5, fontface = "bold",
nudge_y = states$nudge_y) +
scale_fill_viridis_c(trans = "sqrt", alpha = .4) +
annotation_scale(location = "bl", width_hint = 0.4) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.75, "in"), pad_y = unit(0.5, "in"),
style = north_arrow_fancy_orienteering) +
coord_sf(xlim = c(-88, -78), ylim = c(24.5, 33), expand = FALSE) +
xlab("Longitude") + ylab("Latitude") +
ggtitle("Observation Sites", subtitle = "(2 sites in Palm Beach County, Florida)") +
theme(panel.grid.major = element_line(color = gray(0.5), linetype = "dashed",
size = 0.5), panel.background = element_rect(fill = "aliceblue"))
unloadNamespace("maps")
library(usmap)
data(statepop)
states <- read_csv("./data/state_geo_data.csv", col_types = cols(OID = col_character()))
url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
df <- tibble(file_names = c("time_series_covid19_confirmed_US.csv",
"time_series_covid19_deaths_US.csv"))
df %>%
mutate(url = str_c(url_in, file_names, sep =""),
data = map(url, ~read_csv(., col_types = cols(FIPS = col_factor()), na = "")),
case_type = as.factor(str_extract(file_names,"[:alpha:]*_[gU][:alpha:]*"))) %>%
select(case_type, data) ->
df
df$vars <- map(df$data, names)
# Helper: rename the columns of a data frame by regex substitution.
#
# df      - a data frame whose column names should be rewritten
# pattern - regular expression matched against each column name
# repl    - replacement text for every match
#
# Returns the data frame with its names updated.
fix_names <- function(df, pattern, repl) {
  stopifnot(is.data.frame(df), is.character(pattern), is.character(repl))
  new_names <- str_replace_all(names(df), pattern, repl)
  names(df) <- new_names
  df
}
# Clean up the raw JHU tables inside the list-column:
# * rename Admin2 -> County and Long_ -> Long for consistent names
# * the confirmed-cases table has no Population column, so add a zero
#   Population to it to keep the two tables parallel
# * drop rows with a missing FIPS
# * derive the 5-digit county FIPS (cFIPS) and 2-digit state FIPS
#   (sFIPS) from the tail of the UID key
df %>%
  mutate(data = map(data, ~ fix_names(., "Admin2", "County")),
         data = map(data, ~ fix_names(., "Long_", "Long")),
         # (review) use the masked column `case_type`, not `df$case_type`:
         # the latter reaches outside the data mask to the pre-mutate
         # global df, which happens to work here but breaks if the
         # pipeline ever reorders or renames that column
         data = map_if(data, !str_detect(case_type, "deaths_US"),
                       ~ mutate(., Population = 0)),
         data = map(data, ~ filter(., !is.na(FIPS))),
         data = map(data, ~ mutate(., cFIPS = str_extract(UID, ".{5}$"),
                                   sFIPS = str_extract(cFIPS, "^..")))
  ) -> df
# refresh the recorded column names after the renames above
df$vars <- map(df$data, names)
# Reshape each nested table from wide to long: every date column (the
# only names containing "/") becomes a row holding its Daily_Total.
df <- df %>%
  mutate(
    data = map(
      data,
      ~ pivot_longer(data = ., cols = contains("/"),
                     names_to = "Date", values_to = "Daily_Total")
    )
  )
# Helper: convert one character column of df from "m/d/yy" text to Date.
#
# df       - a data frame
# var_name - name (string) of the column to convert
#
# Returns df with that column parsed via lubridate::mdy().
# (review) the call is namespaced because library(lubridate) is only
# attached further down the script; with a bare mdy() this function
# works only by accident of call order.
fix_date_mdy <- function(df, var_name){
  stopifnot(is.data.frame(df), is.character(var_name))
  df[[var_name]] <- lubridate::mdy(df[[var_name]])
  df
}
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Parse the Date column (currently m/d/yy text from the CSV headers)
# in each nested table.
df %>%
mutate(data = map(data, ~fix_date_mdy(., "Date")))->
df
# Flatten the nested list-column into one long tibble; drop the
# bookkeeping `vars` column afterwards.
df_all <- df %>%
  unnest(cols = data) %>%
  ungroup()
# rm(df)
df_all <- df_all %>% select(-vars)
# Drop non-geographic rows and recover the real population:
# * keep rows with a county FIPS, a named county, and not the
#   "Out of <state>" buckets
# * within each UID the confirmed rows carry Population = 0 (set
#   earlier) while the deaths rows carry the true value, so max()
#   recovers the real population for both case types
# * (review) ungroup() added: without it df_all stays grouped by UID,
#   so downstream calls like filter(Date == max(Date)) silently
#   evaluate max() per UID instead of globally
df_all %>%
  group_by(UID) %>%
  filter(!is.na(cFIPS), County != "Unassigned", !str_detect(County, "^Out of ")) %>%
  mutate(Population = max(Population),
         Daily_Total_percap = Daily_Total / Population) %>%
  ungroup() ->
  df_all
#write_csv(df_all, "./data/covid_us_data.csv")
# Blank state-level map of the whole country.
plot_usmap(regions = "states") +
labs(title = "US States",
subtitle = "This is a blank map of the states of the United States.") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"))
# Blank map restricted to the South Atlantic Census region, with state
# labels drawn.
plot_usmap(include = .south_atlantic, labels = TRUE) +
labs(title = "US South Atlantic States",
subtitle = "This is a blank map of the states in the South Atlantic Census Region") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"))
# Blank county-level map of the whole country.
plot_usmap(regions = "counties") +
labs(title = "US Counties",
subtitle = "This is a blank map of the counties of the United States.") +
theme(panel.background = element_rect(color = "black", fill = "lightblue"))
# County-level map restricted to the three DMV jurisdictions.
plot_usmap("counties", include = c("VA", "MD", "DC")) +
  labs(title = "Custom DMV Area",
       # (review) fixed subtitle typo: was "MD, DC< anD VA."
       subtitle = "These are the counties in MD, DC, and VA.") +
  theme(panel.background = element_rect(color = "blue", fill = "lightblue"))
# County/city FIPS codes defining the greater DMV area (prefix 11 = DC,
# 24 = MD, 51 = VA; the 51xxx entries presumably include Virginia's
# independent cities — verify against a FIPS table).
dmv_county_fips <- c("11001","24003","24009","24017","24021","24027","24031",
"24033","51013","51059","51061","51107","51153","51179",
"51510","51600","51610","51630","51683","51685")
# Blank map of just those counties/cities.
plot_usmap("counties", include = dmv_county_fips) +
labs(title = "Greater DMV Area",
subtitle = "These are the Counties/Cities in the Greater DMV area.")+
theme(panel.background = element_rect(color = "blue", fill = "lightblue"))
# Latest-day snapshot: total and per-capita counts by state (sFIPS) and
# case type. Population > 0 drops rows with no recorded population and
# avoids a divide-by-zero in the per-capita rate.
# NOTE(review): df_all is still grouped by UID from the earlier
# pipeline, so max(Date) here is evaluated per UID — correct only if
# every series ends on the same date; confirm or ungroup upstream.
df_all %>%
filter(Date == max(Date), Population >0, !is.na(sFIPS)) %>%
group_by(sFIPS, case_type, Date) %>%
summarise(Current_Total = sum(Daily_Total),
Current_Total_percap = sum(Daily_Total)/sum(Population))->
state_totals
# Choropleth of total confirmed cases per state, latest snapshot.
state_cases <- state_totals %>%
  filter(case_type == "confirmed_US") %>%
  rename(fips = sFIPS)
plot_usmap(data = state_cases, values = "Current_Total", color = "blue") +
  scale_fill_continuous(low = "white", high = "red",
                        name = "Confirmed Cases",
                        label = scales::comma) +
  labs(title = "US States",
       subtitle = paste0("Total Cases by State as of ",
                         max(state_cases$Date))) +
  theme(panel.background = element_rect(color = "black", fill = "white"),
        legend.position = "top")
# Same map for total deaths by state.
state_totals %>%
filter(case_type =="deaths_US") %>%
rename(fips = sFIPS)->
state_cases
plot_usmap(data = state_cases, values = "Current_Total", color = "blue") +
scale_fill_continuous(low = "white", high = "red",
name = "Total Deaths", label = scales::comma)+
labs(title = "US States",
subtitle = paste0("Total Deaths by State as of ", max(state_cases$Date))) +
theme(panel.background = element_rect(color = "black", fill = "white")) +
theme(legend.position = "top")
# NOTE(review): this block is an exact, byte-for-byte duplicate of the
# previous total-deaths map (same filter, same plot) — it regenerates
# the identical figure and looks like an accidental copy/paste;
# consider removing one copy.
state_totals %>%
filter(case_type =="deaths_US") %>%
rename(fips = sFIPS)->
state_cases
plot_usmap(data = state_cases, values = "Current_Total", color = "blue") +
scale_fill_continuous(low = "white", high = "red",
name = "Total Deaths", label = scales::comma)+
labs(title = "US States",
subtitle = paste0("Total Deaths by State as of ", max(state_cases$Date))) +
theme(panel.background = element_rect(color = "black", fill = "white")) +
theme(legend.position = "top")
# Choropleth of total deaths per capita by state.
state_totals %>%
  filter(case_type == "deaths_US") %>%
  rename(fips = sFIPS) ->
  state_cases
plot_usmap(data = state_cases, values = "Current_Total_percap",
           color = "blue") +
  scale_fill_continuous(low = "white", high = "red",
                        name = "Total Deaths Per Capita",
                        label = scales::comma) +
  labs(title = "US States",
       # (review) removed a stray `... =` argument name that had crept
       # into this paste0() call
       subtitle = paste0("Total Deaths per Capita by State as of ",
                         max(state_cases$Date))) +
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "right")
# Log-scale view: per-capita death rates span orders of magnitude, so
# map log10 of the rate instead of the raw rate.
state_totals %>%
  filter(case_type == "deaths_US") %>%
  # Current_Total10 is computed alongside but only the per-capita log
  # is mapped below
  mutate(Current_Total10 = log10(Current_Total),
         Current_Total_percap10 = log10(Current_Total_percap)) %>%
  rename(fips = sFIPS) ->
  state_cases
plot_usmap(data = state_cases, values = "Current_Total_percap10",
           color = "blue") +
  scale_fill_continuous(low = "white", high = "red",
                        name = "Total Deaths per Capita (Log 10)",
                        label = scales::comma) +
  labs(title = "US States",
       # (review) removed a stray `... =` argument name from paste0()
       subtitle = paste0("Log(10) of Total Deaths per Capita by State as of ",
                         max(state_cases$Date))) +
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "right")
# Total deaths per unit land area, joining state land area (AREALAND)
# from the Census geography file read earlier.
state_cases %>%
  left_join(states, by = c("fips" = "STATE")) %>%
  mutate(Current_Total_perarea = Current_Total / AREALAND) ->
  state_casesa
plot_usmap(data = state_casesa, values = "Current_Total_perarea",
           color = "blue") +
  scale_fill_continuous(low = "white", high = "red",
                        # (review) the legend previously said "(Log 10)"
                        # but the mapped value is the raw per-area rate,
                        # not its log — label corrected
                        name = "Total Deaths per unit Area",
                        label = scales::comma) +
  labs(title = "US States",
       # (review) removed a stray `... =` argument name from paste0()
       subtitle = paste0("Total Deaths per unit area by State as of ",
                         max(state_cases$Date))) +
  theme(panel.background = element_rect(color = "black", fill = "white")) +
  theme(legend.position = "right")
# Latest-day snapshot aggregated at the county level (cFIPS), mirroring
# the state_totals computation above.
df_all %>%
filter(Date == max(Date), Population >0, !is.na(cFIPS)) %>%
group_by(cFIPS, case_type, Date) %>%
summarise(Current_Total = sum(Daily_Total),
Current_Total_percap = sum(Daily_Total)/sum(Population))->
county_totals
# Confirmed cases by county for New York and New Jersey.
county_totals %>%
filter(case_type =="confirmed_US") %>%
# Current_Total_log2 is computed here but this plot maps the raw total
mutate(Current_Total_log2 = log2(Current_Total)) %>%
rename(fips = cFIPS)->
county_cases
plot_usmap(data = county_cases, include = c("NY","NJ"),
values = "Current_Total", color = "blue") +
scale_fill_continuous(low = "white", high = "red",
name = "Confirmed Cases", label = scales::comma)+
labs(title = "US States",
# subtitle date is taken from state_cases, not county_cases —
# presumably the same snapshot date; confirm
subtitle = paste0("Total Cases by County as of ",
max(state_cases$Date))) +
theme(panel.background = element_rect(color = "black", fill = "white")) +
theme(legend.position = "top")
# Confirmed-case totals for the greater DMV counties/cities.
plot_usmap(data = county_cases, include = dmv_county_fips,
           values = "Current_Total", color = "blue") +
  scale_fill_continuous(low = "white", high = "red",
                        name = "Confirmed Cases",
                        label = scales::comma) +
  labs(title = "DMV Region",
       subtitle = paste0("Total Cases by County/City as of ",
                         max(state_cases$Date))) +
  theme(panel.background = element_rect(color = "black", fill = "white"),
        legend.position = "top")
# Death totals for the same DMV counties/cities.
county_totals %>%
filter(case_type =="deaths_US") %>%
rename(fips = cFIPS)->
county_deaths
plot_usmap(data = county_deaths, include = dmv_county_fips,
values = "Current_Total", color = "blue") +
scale_fill_continuous(low = "white", high = "red",
name = "Confirmed Deaths", label = scales::comma)+
labs(title = "DMV Region",
subtitle = paste0("Total Deaths by County/City as of ",
max(state_cases$Date))) +
theme(panel.background = element_rect(color = "black", fill = "white")) +
theme(legend.position = "top")
# Per-capita confirmed cases for the DMV counties/cities.
plot_usmap(data = county_cases, include = dmv_county_fips,
           values = "Current_Total_percap", color = "blue") +
  scale_fill_continuous(low = "white", high = "red",
                        name = "Confirmed Cases Per Capita",
                        label = scales::comma) +
  labs(title = "DMV Region",
       subtitle = paste0("Total Cases per Capita by County/City as of ",
                         max(state_cases$Date))) +
  theme(panel.background = element_rect(color = "black", fill = "white"),
        legend.position = "top")
# Per-capita deaths for the DMV counties/cities.
# NOTE(review): county_deaths is recomputed here identically to the
# earlier deaths block — the recompute is redundant but harmless.
county_totals %>%
filter(case_type =="deaths_US") %>%
rename(fips = cFIPS)->
county_deaths
plot_usmap(data = county_deaths, include = dmv_county_fips,
values = "Current_Total_percap", color = "blue") +
scale_fill_continuous(low = "white", high = "red",
name = "Confirmed Deaths per Capita", label = scales::comma)+
labs(title = "DMV Region",
subtitle = paste0("Total Deaths per Capita by County/City as of ",
max(state_cases$Date))) +
theme(panel.background = element_rect(color = "black", fill = "white")) +
theme(legend.position = "top")
library(tidycensus)
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
##
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
##
## plot
library(sf)
library(viridis) #a color palette
## Loading required package: viridisLite
options(tigris_use_cache = TRUE)
# Median household income (ACS variable B19013_001) by census tract for
# Baltimore City, with tract geometry attached for mapping.
baltimore <- get_acs(state = "MD", county = "Baltimore City", geography = "tract",
                     variables = "B19013_001", geometry = TRUE,
                     key = Sys.getenv("CENSUS_API_KEY"))
## Getting data from the 2014-2018 5-year ACS
#head(baltimore)
baltimore %>%
  ggplot(aes(fill = estimate)) +
  geom_sf(color = NA) +
  # (review) EPSG:26918 is NAD83 / UTM zone 18N, which covers Maryland;
  # the previous value 26911 (zone 11N) is a far-western-US zone and
  # badly distorts an East Coast map
  coord_sf(crs = 26918) +
  scale_fill_viridis_c(option = "magma")
# 2010 decennial race/ethnicity variables to pull, with readable labels.
racevars <- c(White = "P005003",
Black = "P005004",
Asian = "P005006",
Hispanic = "P004003")
# summary_var P001001 (presumably the tract total population — confirm)
# arrives as summary_value and is used as the denominator below.
baltimore_race <- get_decennial(state = "MD", county = "Baltimore City",
geography = "tract",
variables = racevars, geometry = TRUE,
summary_var = "P001001",
key = Sys.getenv("CENSUS_API_KEY"))
## Getting data from the 2010 decennial Census
# Percent of each tract's population in each group, as small multiples.
baltimore_race %>%
mutate(pct = 100 * (value / summary_value)) %>%
ggplot(aes(fill = pct)) +
facet_wrap(~variable) +
geom_sf(color = NA) +
# NOTE(review): EPSG 26915 is NAD83 / UTM zone 15N (upper Midwest);
# Maryland falls in zone 18N (26918) — confirm the intended CRS
coord_sf(crs = 26915) +
scale_fill_viridis_c()
# Tract-level median household income for the District of Columbia.
dc_median <- get_acs(state = "DC", county = "District of Columbia", geography = "tract",
variables = "B19013_001", geometry = TRUE, key = Sys.getenv("CENSUS_API_KEY"))
## Getting data from the 2014-2018 5-year ACS
# Characters 6-11 of the GEOID are the 6-digit tract code (the GEOID is
# state(2) + county(3) + tract(6) — confirm); this gives a join key
# matching the crime data's CENSUS_TRACT column.
dc_median %>%
mutate(CENSUS_TRACT = str_sub(GEOID,6,11)) ->
dc_median
dc_median %>%
ggplot(aes(fill = estimate)) +
geom_sf(aes(geometry = geometry),color = NA) +
# NOTE(review): 26915 is UTM zone 15N; DC is in zone 18N — confirm
coord_sf(crs = 26915) +
scale_fill_viridis_c()
# DC crime incidents; ID-like columns are read as factors/character so
# codes are not mangled into numbers.
crime <- read_csv("./data/dc-crimes-search-results.csv", col_types = cols(DISTRICT = col_factor(),
WARD = col_factor(),
PSA = col_factor(),
METHOD= col_factor(),
CENSUS_TRACT = col_character()
))
# Incident counts per census tract and method, joined to the tract
# geometry from dc_median (natural join on CENSUS_TRACT), mapped as
# small multiples by METHOD.
crime %>%
filter(METHOD != "others") %>%
group_by(CENSUS_TRACT, METHOD) %>%
count() %>%
left_join(dc_median) %>%
ggplot(aes(fill = n)) +
geom_sf(aes(geometry = geometry),color = NA) +
coord_sf(crs = 26915) +
scale_fill_viridis_c() +
facet_wrap(~METHOD)
## Joining, by = "CENSUS_TRACT"
# Read the 2012 DC ward boundary shapefile and coerce WARD to a factor
# so it can be matched against the factor WARD column in the crime data.
wards <- read_sf("./data/Ward_from_2012") %>%
  mutate(WARD = parse_factor(as.character(WARD)))
# head(wards)
# Incident counts per ward and method, joined to the ward geometry
# (natural join on WARD) and mapped as small multiples. The join
# coerces the two WARD factors to character (see warning below).
crime %>%
filter(METHOD != "others") %>%
group_by(WARD, METHOD) %>%
count() %>%
left_join(wards) %>%
ggplot(aes(fill = n)) +
geom_sf(aes(geometry = geometry),color = NA) +
coord_sf(crs = 26915) +
scale_fill_viridis_c() +
facet_wrap(~METHOD)
## Joining, by = "WARD"
## Warning: Column `WARD` joining factors with different levels, coercing to
## character vector
# Sanity check: do any census tracts appear in more than one ward?
# Count distinct (tract, ward) pairs per tract; n > 1 means the tract
# straddles wards, so a tract -> ward join would be ambiguous. The
# output below shows 20 such tracts (plus NA tracts).
crime %>%
select(CENSUS_TRACT, WARD) %>%
group_by(CENSUS_TRACT) %>%
unique() %>%
count() %>%
filter(n>1) %>%
arrange(desc(n))
## # A tibble: 20 x 2
## # Groups: CENSUS_TRACT [20]
## CENSUS_TRACT n
## <chr> <int>
## 1 <NA> 4
## 2 004801 3
## 3 000300 2
## 4 001401 2
## 5 001402 2
## 6 002302 2
## 7 003400 2
## 8 004201 2
## 9 004300 2
## 10 004600 2
## 11 004901 2
## 12 004902 2
## 13 005800 2
## 14 005900 2
## 15 006202 2
## 16 006804 2
## 17 007601 2
## 18 007604 2
## 19 007605 2
## 20 010600 2
# American University's location as an sf point in WGS84 lon/lat
# (EPSG:4326); agr = "constant" marks the attributes as constant over
# the geometry.
au_latlong <- data.frame(longitude = -77.0888, latitude = 38.9375)
au_latlong<- st_as_sf(au_latlong, coords = c("longitude", "latitude"),
crs = 4326, agr = "constant")
# Ward-level crime counts again, this time with the AU point overlaid
# as a red diamond on every facet.
crime %>%
filter(METHOD != "others") %>%
group_by(WARD, METHOD) %>%
count() %>%
left_join(wards) %>%
ggplot(aes(fill = n)) +
geom_sf(aes(geometry = geometry),color = NA) +
coord_sf(crs = 26915) +
scale_fill_viridis_c() +
facet_wrap(~METHOD) +
geom_sf(data = au_latlong, size = 4, shape = 23, fill = "red")
## Joining, by = "WARD"
## Warning: Column `WARD` joining factors with different levels, coercing to
## character vector
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Final view: DC tract-level median household income with the AU
# location marked as a red diamond.
ggplot(data = dc_median, mapping = aes(fill = estimate)) +
  geom_sf(aes(geometry = geometry), color = NA) +
  coord_sf(crs = 26915) +
  scale_fill_viridis_c() +
  geom_sf(data = au_latlong, size = 4, shape = 23, fill = "red")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.